CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     pipelined.f
C
C     This program uses a four-point stencil to smooth a 1,000 by 1,000
C     array.  The smoothing is done in parallel using domain decomposition
C     and iterative relaxation.  That is, all tasks are given a fixed portion
C     of the problem domain -- part of the array -- over which they do the
C     smoothing.  At the end of each iteration they exchange borders.  To
C     simplify the code and get better efficiency, each task maintains a
C     copy of each neighbor's border in "ghost cells".  Ghost cells are an
C     extra row or column added to each side of the task's stencil array,
C     used to hold neighboring values as input to the computation.
C
C     The calculation is done in the same manner as for the "skewed"
C     version, with the exception that global error is propagated
C     asynchronously via point-to-point message sends rather than a
C     global, synchronous operation like MPI_Allreduce.  This allows
C     multiple waves of calculations to take place, while still maintaining
C     the same order of calculation as in the original serial version.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      PROGRAM MAIN
      IMPLICIT NONE
      INTEGER prob_size
      REAL close_enough
      PARAMETER (prob_size=1000, close_enough=0.1)
      INCLUDE 'stencil.h'
      REAL stencil
      COMMON stencil(0:prob_size-1,0:prob_size-1)

      CALL init_comm(prob_size)
      IF (my_task.EQ.0) PRINT *, "initializing the array."
      CALL init_stencil(stencil, prob_size, task_rows, task_cols)
      IF (my_task.EQ.0) PRINT *, "computing the stencil."
      CALL compute_stencil(stencil, task_rows, task_cols, close_enough)
      IF (my_task.EQ.0) PRINT *, "an answer was found."
      CALL term_comm()
      IF (my_task.EQ.0) PRINT *, "ending the program."

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     init_comm
C
C     This routine brings up MPI and sets up all of the values needed to
C     successfully and efficiently do the smoothing operation in parallel.
C     Most variables used here are described in the .h file for this
C     program.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE init_comm(prob_size)
      IMPLICIT NONE
      INTEGER prob_size
      INCLUDE 'mpif.h'
      INCLUDE 'stencil.h'

      INTEGER corner(grid_rank)

      CALL MPI_Init(ierror)
      CALL MPI_Comm_size(MPI_Comm_world, num_tasks, ierror)
      CALL MPI_Comm_rank(MPI_Comm_world, my_task, ierror)
      dims(1) = sqrt(real(num_tasks))
      dims(2) = num_tasks / dims(1)
      wrap(1) = .false.
      wrap(2) = .false.
      CALL MPI_Cart_create(MPI_Comm_world, grid_rank, dims, wrap,
     1   .true., my_comm, ierror)
      CALL MPI_Cart_shift(my_comm, 0, 1, n_task, s_task, ierror)
      CALL MPI_Cart_shift(my_comm, 1, 1, w_task, e_task, ierror)
      CALL MPI_Cart_get(my_comm, 2, dims, wrap, coords, ierror)

      rowwidth  = 1 + (prob_size - 2 - 1) / dims(1)
      colwidth  = 1 + (prob_size - 2 - 1) / dims(2)
      row_st    = 1 + coords(1) * rowwidth
      row_end   = MIN(prob_size - 2, row_st + rowwidth - 1)
      col_st    = 1 + coords(2) * colwidth
      col_end   = MIN(prob_size - 2, col_st + colwidth - 1)
      task_rows = 2 + 1 + row_end - row_st
      task_cols = 2 + 1 + col_end - col_st

      CALL MPI_Type_vector(task_cols-2, 1, task_rows, MPI_Real,
     1   row_type, ierror)
      CALL MPI_Type_vector(task_rows-2, 1,         1, MPI_Real,
     1   col_type, ierror)
      CALL MPI_Type_commit(row_type, ierror)
      CALL MPI_Type_commit(col_type, ierror)

      corner(1) =  dims(1) - 1
      corner(2) =  dims(2) - 1
      CALL MPI_Cart_rank(my_comm, corner, se_task, ierror)
      corner(1) =  0
      corner(2) =  0
      CALL MPI_Cart_rank(my_comm, corner, nw_task, ierror)

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     init_stencil
C
C     This routine reads in the initial values for the array over which
C     smoothing will occur.  Since the program is executed in parallel,
C     and this task operates on only a portion of the whole array, only
C     those records which are needed are read in.  But since Fortran
C     does not offer the ability to read in partial records, the whole
C     record is read into a buffer, then the useful part is copied into
C     the array.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE init_stencil(stencil, prob_size, m, n)
      IMPLICIT NONE
      INCLUDE 'stencil.h'
      INTEGER prob_size, m, n, i, j
      REAL stencil(0:m-1, 0:n-1)
      REAL buffer(0:prob_size-1)

      OPEN(UNIT=1, FILE="stencil.dat", STATUS="OLD", ACTION="READ",
     1   FORM="UNFORMATTED", ACCESS="DIRECT", RECL=prob_size*4)
      DO j=col_st-1, col_end+1
         READ (UNIT=1, REC=j+1) (buffer(i), i=0, prob_size-1)
         DO i=0, m-1
            stencil(i, j-(col_st-1)) = buffer(i + row_st - 1)
         END DO
      END DO
      CLOSE(UNIT=1)

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     compute_stencil
C
C     This routine smooths the values of the array.  The routine first
C     exchanges the borders it shares with its northern and western
C     neighbors, then it sweeps over consecutive rows of its portion of
C     the array until it reaches the end.  Next it exchanges borders 
C     with its southern and eastern neighbors.  This is all the same as
C     for the "skewed" version.
C
C     The synchronous global reduce operation, which determined whether
C     the global error exceeded a threshhold, has been replaced with
C     asynchronous point-to-point sends and receives which can be found
C     in the two exchange routines.  Also, the routine relies much more
C     heavily on the local error to decide whether to continue with the
C     next iteration or not.  In effect, the northwest corner task keeps
C     cycling through iterations as fast as it can compute them and
C     exchange borders with its neighbors, until its own local error drops
C     below the threshhold.  Only then does it check whether the global
C     error is above the threshhold to decide whether to begin the next
C     iteration.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE compute_stencil(stencil, m, n, close_enough)
      IMPLICIT NONE
      INCLUDE 'mpif.h'
      INCLUDE 'stencil.h'
      INTEGER m, n
      REAL stencil(0:m-1, 0:n-1)
      REAL close_enough
      REAL local_err, global_err, old_val
      INTEGER i, j, iter_count

      iter_count = 0
      local_err = close_enough + 1
 100  CONTINUE
	 iter_count = iter_count + 1
 	 CALL exch_in(stencil, m, n, local_err, global_err,
     1      iter_count, close_enough)

	 IF (MAX(global_err,local_err).GE.close_enough) THEN
            local_err = 0.0
	    DO j=1, n-2
	       DO i=1, m-2
	          old_val = stencil(i,j)

	          stencil(i,j) = ( stencil( i-1, j ) +
     1                             stencil( i+1, j ) +
     2                             stencil( i ,j-1) +
     3                             stencil( i ,j+1) ) / 4

                  local_err = MAX(local_err, ABS(old_val-stencil(i,j)))
	       END DO
	    END DO
	 END IF

 	 CALL exch_out(stencil, m, n, global_err, local_err)

         IF(MOD(iter_count,100).EQ.0)PRINT *, iter_count, global_err
      IF (MAX(global_err,local_err).GE.close_enough) GOTO 100
      PRINT *, "convergence reached after", iter_count, "iterations."

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     exch_in
C
C     This routine exchanges two of the four borders, north and west,
C     placing the incoming values in the appropriate "ghost cells".  It
C     first initiates both receives as asynchronous receives, then both
C     sends as asynchronous sends, then it waits for all sends and
C     receives to complete.
C
C     It also gets the global error and accumulated local error from 
C     the north and west.  If the task executing the code is the 
C     northwest corner task, it receives any global error messages from
C     the southeast that may have arrived.  Next it checks whether its
C     local error is sufficient to merit another iteration.  If it is
C     not, it waits until the most recent global error message arrives,
C     and checks the global error against the threshhold.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE exch_in(stencil, m, n, local_err, global_err,
     1      iter_count, close_enough)
      IMPLICIT NONE
      INCLUDE 'mpif.h'
      INCLUDE 'stencil.h'
      INTEGER m, n
      REAL stencil(0:m-1, 0:n-1), local_err, global_err, close_enough
      INTEGER request(4), my_status(MPI_Status_size), i
      INTEGER iter_count, se_iter, from_se
      LOGICAL recv_complete
      SAVE se_iter, from_se
      DATA se_iter / 0 /, from_se / MPI_Request_null /
      REAL buffer(2)
      SAVE buffer

      buffer(1) = 0
      buffer(2) = 0
      IF (my_task.EQ.nw_task) THEN
	 n_err = 0
	 w_err = 0
 100     CONTINUE
         CALL MPI_Test(from_se, recv_complete, my_status, ierror)
         IF (recv_complete) GOTO 101
            CALL MPI_Irecv(global_err, 1, MPI_Real, se_task, 0,
     1         my_comm, request, ierror)
	    se_iter = se_iter + 1
	    GOTO 100
 101     CONTINUE
	 IF (local_err.GE.close_enough) THEN
	    global_err = local_err
	 ELSE
 102        CONTINUE
	    IF (se_iter.GE.iter_count-1) GOTO 103
***            receive global from se corner
               CALL MPI_Recv(global_err, 1, MPI_Real, se_task, 0,
     1            my_comm, my_status, ierror)
	       se_iter = se_iter + 1
	       GOTO 102
 103        CONTINUE
	 END IF
      ELSE
***      receive global, n_err from n
         CALL MPI_Recv(buffer, 2, MPI_Real, n_task, 0,
     1      my_comm, my_status, ierror)
         n_err = buffer(1)
	 global_err = buffer(2)
***      receive global, w_err from w
         CALL MPI_Recv(buffer, 2, MPI_Real, w_task, 0,
     1      my_comm, my_status, ierror)
         w_err = buffer(1)
	 global_err = buffer(2)
      END IF

      CALL MPI_Irecv(stencil(  0,  1), 1, row_type, n_task, 0,
     1   my_comm, request(1), ierror)
      CALL MPI_Irecv(stencil(  1,  0), 1, col_type, w_task, 0,
     1   my_comm, request(2), ierror)

      CALL MPI_Isend(stencil(  1,  1), 1, row_type, n_task, 0,
     1   my_comm, request(3), ierror)
      CALL MPI_Isend(stencil(  1,  1), 1, col_type, w_task, 0,
     1   my_comm, request(4), ierror)

      DO i=1, 4
         CALL MPI_Wait(request(i), my_status, ierror)
      END DO

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     exch_out
C
C     This routine exchanges two of the four borders, south and east,
C     placing the incoming values in the appropriate "ghost cells".  It
C     first initiates both receives as asynchronous receives, then both
C     sends as asynchronous sends, then it waits for all sends and
C     receives to complete.
C
C     If the task executing this routine is the southeast corner task,
C     it also sends the accumulated error to the northwest corner task.
C     Otherwise it sends the accumulated error to the south and east,
C     along with the global error from the previous iteration.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE exch_out(stencil, m, n, global_err, local_err)
      IMPLICIT NONE
      INCLUDE 'mpif.h'
      INCLUDE 'stencil.h'
      INTEGER m, n
      REAL stencil(0:m-1, 0:n-1), global_err, local_err
      INTEGER request(4), my_status(MPI_Status_size), i
      REAL buffer(2)
      SAVE request, buffer
      DATA request / 4*0 /

      buffer(1) = MAX(local_err, n_err, w_err)
      buffer(2) = global_err

      IF(my_task.EQ.se_task) THEN
         CALL MPI_Send(buffer, 1, MPI_Real, nw_task, 0,
     1        my_comm, ierror)
      ELSE
         CALL MPI_Send(buffer, 2, MPI_Real, s_task, 0,
     1        my_comm, ierror)
         CALL MPI_Send(buffer, 2, MPI_Real, e_task, 0,
     1        my_comm, ierror)
      END IF

      CALL MPI_Irecv(stencil(m-1,  1), 1, row_type, s_task, 0,
     1   my_comm, request(1), ierror)
      CALL MPI_Irecv(stencil(  1,n-1), 1, col_type, e_task, 0,
     1   my_comm, request(2), ierror)

      CALL MPI_Isend(stencil(m-2,  1), 1, row_type, s_task, 0,
     1   my_comm, request(3), ierror)
      CALL MPI_Isend(stencil(  1,n-2), 1, col_type, e_task, 0,
     1   my_comm, request(4), ierror)

      DO i=1, 4
         CALL MPI_Wait(request(i), my_status, ierror)
      END DO

      END


CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
C     term_comm
C
C     This routine frees the MPI message defined types and terminates MPI.
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE term_comm()
      IMPLICIT NONE
      INCLUDE 'stencil.h'

      CALL MPI_Type_free(col_type, ierror)
      CALL MPI_Type_free(row_type, ierror)
      CALL MPI_Finalize(ierror)

      END
